#Required libraries
# Tidyverse for data science and exploration
require(dplyr)
Loading required package: dplyr
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
require(tidyr)
Loading required package: tidyr
require(readr)
Loading required package: readr
require(tibble)
Loading required package: tibble
require(stringr)
Loading required package: stringr
require(purrr)
Loading required package: purrr
require(forcats)
Loading required package: forcats
require(rlang)
Loading required package: rlang
Attaching package: ‘rlang’
The following objects are masked from ‘package:purrr’:
%@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
modify, prepend, splice
# enhances tidyverse
require(tidylog) # additional logging
Loading required package: tidylog
Attaching package: ‘tidylog’
The following objects are masked from ‘package:tidyr’:
drop_na, fill, gather, pivot_longer, pivot_wider, replace_na, spread, uncount
The following objects are masked from ‘package:dplyr’:
add_count, add_tally, anti_join, count, distinct, distinct_all, distinct_at, distinct_if, filter, filter_all,
filter_at, filter_if, full_join, group_by, group_by_all, group_by_at, group_by_if, inner_join, left_join, mutate,
mutate_all, mutate_at, mutate_if, relocate, rename, rename_all, rename_at, rename_if, rename_with, right_join,
sample_frac, sample_n, select, select_all, select_at, select_if, semi_join, slice, slice_head, slice_max,
slice_min, slice_sample, slice_tail, summarise, summarise_all, summarise_at, summarise_if, summarize,
summarize_all, summarize_at, summarize_if, tally, top_frac, top_n, transmute, transmute_all, transmute_at,
transmute_if, ungroup
The following object is masked from ‘package:stats’:
filter
require(magrittr) # additional data pipe syntax
Loading required package: magrittr
Attaching package: ‘magrittr’
The following object is masked from ‘package:rlang’:
set_names
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
# for reading data in multiple formats
require(readxl)
Loading required package: readxl
require(haven)
Loading required package: haven
# visual analysis
require(ggplot2)
Loading required package: ggplot2
Want to understand how all the pieces fit together? Read R for Data Science: https://r4ds.had.co.nz/
require(GGally) # extensions to ggplot
Loading required package: GGally
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
require(gt) # well formatted tables
Loading required package: gt
# client-side interactive publishable graphics
require(plotly)
Loading required package: plotly
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following objects are masked from ‘package:tidylog’:
distinct, filter, group_by, mutate, rename, select, slice, summarise, transmute, ungroup
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
require(leaflet)
Loading required package: leaflet
require(crosstalk)
Loading required package: crosstalk
require(htmlwidgets)
Loading required package: htmlwidgets
# server-side interactive graphics
require(shiny)
Loading required package: shiny
Attaching package: ‘shiny’
The following object is masked from ‘package:crosstalk’:
getDefaultReactiveDomain
require(shinyjs)
Loading required package: shinyjs
Find out advanced usage of shinyjs:
https://deanattali.com/shinyjs/advanced
Attaching package: ‘shinyjs’
The following object is masked from ‘package:shiny’:
runExample
The following object is masked from ‘package:gt’:
html
The following objects are masked from ‘package:methods’:
removeClass, show
# Canned Interactive EDA
require(ExPanDaR)
Loading required package: ExPanDaR
Exploring KU Book Processing Charges
# read KU data frame
KUbpc.df <- read_csv("Public Data/openapc-de/data/bpc.csv")
Parsed with column specification:
cols(
institution = col_character(),
period = col_double(),
euro = col_double(),
doi = col_character(),
backlist_oa = col_logical(),
publisher = col_character(),
book_title = col_character(),
isbn = col_character(),
isbn_print = col_character(),
isbn_electronic = col_character(),
license_ref = col_character(),
indexed_in_crossref = col_logical(),
doab = col_logical()
)
# read DOAB metadata
source('Public Data/DOAB/doabingest.R')
DOABmeta.df <- doabFetch()
embedded nul(s) found in input
head(KUbpc.df)
head(summary(KUbpc.df))
institution period euro doi backlist_oa publisher book_title
Length:938 Min. :2017 Min. :1075 Length:938 Mode :logical Length:938 Length:938
Class :character 1st Qu.:2017 1st Qu.:1875 Class :character FALSE:357 Class :character Class :character
Mode :character Median :2018 Median :1981 Mode :character TRUE :581 Mode :character Mode :character
Mean :2018 Mean :4368
3rd Qu.:2019 3rd Qu.:8250
Max. :2020 Max. :8978
isbn isbn_print isbn_electronic license_ref indexed_in_crossref doab
Length:938 Length:938 Length:938 Length:938 Mode :logical Mode :logical
Class :character Class :character Class :character Class :character FALSE:127 FALSE:44
Mode :character Mode :character Mode :character Mode :character TRUE :811 TRUE :894
ggplot(data = KUbpc.df, aes(KUbpc.df$institution)) + geom_bar()

ggplot(data = KUbpc.df, aes(KUbpc.df$euro)) + geom_histogram()

General Exploratory Data Analysis
ggplot(data = KUbpc.df) + geom_bar(mapping = aes(x = KUbpc.df$doab))

# Date to Doab
date_doab <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$period, colour = KUbpc.df$doab)) + geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)
# publisher_euro <- KUbpc.df %>%
# ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$publisher, colour = KUbpc.df$euro)) + geom_freqpoly(binwidth = 0.1)
# Institution to Euro
institution_euro <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$euro)) + geom_freqpoly(mapping = aes(colour = KUbpc.df$institution), binwidth = 500)
ggplotly(institution_euro)
NA
Idea: Publishers vs. Charges
Question: How do the top 25% of publishers divide up charges (in Euro)?
Observation: Charges are grouped around ~2000 Euros and ~8000 Euros.
publisher_counts <- KUbpc.df %>%
group_by(publisher) %>%
tally
tally: now 110 rows and 2 columns, ungrouped
sorted_counts = arrange(publisher_counts, desc(n))
total_n = sum(sorted_counts$n)
quarter_n = 0.25 * total_n
new_n = sum(sorted_counts$n[0:6])
sorted_counts %>% filter(n > 24)
# filtered <- filter(KUbpc.df$publisher %in% sorted_counts$publisher)
filtered <- filter(KUbpc.df, KUbpc.df$publisher == 'transcript Verlag' |
KUbpc.df$publisher == 'Duke University Press' |
KUbpc.df$publisher == 'University of Michigan Press' |
KUbpc.df$publisher == 'Manchester University Press' |
KUbpc.df$publisher == 'Pluto Press' |
KUbpc.df$publisher == 'Liverpool University Press')
head(filtered)
euro_publisher <- filtered %>%
ggplot(data = filtered, mapping = aes(x = filtered$publisher, y = filtered$euro),
aes(x = filtered$publisher, y = filtered$euro)) +
# geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) +
geom_count(aes(color = ..n.., group = euro)) +
scale_size_area(max_size = 10) +
theme(axis.text = element_text(size = rel(0.75))) +
labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
# ggplot:
ggplotly(euro_publisher)
`group_by_()` is deprecated as of dplyr 0.7.0.
Please use `group_by()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$publisher, y = filtered$euro)) +
geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) +
labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
cross_ft <- bscols(
filter_select("publisher", "Select a publisher", ft, ~publisher),
ggplotly(gg_ft, dynamicTicks = TRUE),
widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)
# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)
Idea: Publishers’ Charges vs. Year/OA Type
Sub-Question: What best explains the particular division of charges? (Year, OA Type)
Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year.
head(filtered)
# Does Type of OA impact the particular division of charges?
euro_oa_publisher <- filtered %>%
ggplot(data = filtered, mapping = aes(x = filtered$backlist_oa, y = filtered$euro),
aes(x = filtered$backlist_oa, y = filtered$euro)) +
geom_count(aes(color = ..n.., group = euro)) +
scale_size_area(max_size = 10) +
theme(axis.text = element_text(size = rel(0.75))) +
labs(title = "How OA Impacts Price Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')
# ggplot:
ggplotly(euro_oa_publisher)
# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$backlist_oa, y = filtered$euro)) +
geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) +
labs(title = "How OA Impacts Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
filter_select("publisher", "Select a publisher", ft, ~publisher),
ggplotly(gg_ft, dynamicTicks = TRUE),
widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)
# Does Year impact the particular division of charges?
euro_year_publisher <- filtered %>%
ggplot(data = filtered, mapping = aes(x = filtered$period, y = filtered$euro),
aes(x = filtered$period, y = filtered$euro)) +
geom_count(aes(color = ..n.., group = euro)) +
scale_size_area(max_size = 10) +
theme(axis.text = element_text(size = rel(0.75))) +
labs(title = "How Year Impacts Price Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')
# ggplot:
ggplotly(euro_year_publisher)
# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$period, y = filtered$euro)) +
geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) +
labs(title = "How Year Impacts Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
filter_select("publisher", "Select a publisher", ft, ~publisher),
ggplotly(gg_ft, dynamicTicks = TRUE),
widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)
NA
NA
Idea: Publishers vs. OA
Question: What type of business model do the top 25% publishers use?
Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).
oa_type <- filtered %>%
ggplot(data = filtered, mapping = aes(x = filtered$publisher, colour = filtered$backlist_oa), fill = filtered$backlist_oa) +
geom_bar(position = "fill", width = 0.7, fill="#EAEAEA") +
labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion", color = 'Types of OA') +
theme(axis.text = element_text(size = rel(0.75))) +
scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
scale_color_brewer(palette = "Set1")
ggplotly(oa_type)
# crosstalk:
ft <- highlight_key(filtered)
oa_ft <- ggplot(data = ft, mapping = aes(x = ft$publisher, colour = ft$backlist_oa), fill = ft$backlist_oa) +
geom_bar(position = "fill", width = 0.7) +
labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion of Backlist OA", color = 'Types of OA') +
theme(axis.text = element_text(size = rel(0.75))) +
scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
# cross_oa_ft <- bscols(
# filter_select("publisher", "Select a publisher", ft, ~publisher),
# ggplotly(oa_ft, dynamicTicks = TRUE),
# # widths = c(12, 12)
# )
# bscols(cross_oa_ft)
Idea: Publishers’ OA vs. Year
Question: Did OA business models of the top 25% publishers change per year?
Observation:
oa_time <- function(pub_name) {
pub_ft <- filter(filtered, filtered$publisher == pub_name)
pub_oa <- pub_ft %>%
ggplot(data = pub_ft, mapping = aes(x = pub_ft$period, colour = pub_ft$backlist_oa), fill = pub_ft$backlist_oa) +
geom_bar(position = "fill", width = 0.7, fill="#EAEAEA") +
labs(title = paste(pub_name, "'s OA Through the Years", sep = ""),
x = "Years", y = "Proportion of Backlist OA", color = 'Types of OA') +
theme(axis.text = element_text(size = rel(0.75))) +
scale_x_discrete(limits=c(2017, 2018, 2019)) +
scale_color_brewer(palette = "Set1")
ggplotly(pub_oa)
}
top25_list = c("transcript Verlag", "Duke University Press", "University of Michigan Press", "Manchester University Press", "Pluto Press", "Liverpool University Press")
oa_time("transcript Verlag")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
oa_time("Duke University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
oa_time("University of Michigan Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
oa_time("Manchester University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
oa_time("Pluto Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
oa_time("Liverpool University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
Idea: Revenue vs. OA
Question: What total revenue are publishers receiving each year?
Observation:
# Finding total revenue for each publisher
revenue_finder <- function(pub_name) {
pub_filtered <- filter(filtered, filtered$publisher == pub_name)
rev = sum(pub_filtered$euro)
}
revenue_df <- data.frame("publisher" = top25_list)
revenue_list <- c()
for (i in top25_list) {
revenue_list<-c(revenue_list,revenue_finder(i))
}
revenue_df$revenue <- c(revenue_list)
print(revenue_df)
# ggplot:
publisher_revenue <- revenue_df %>%
ggplot(data = revenue_df, mapping = aes(x = revenue_df$publisher, y = revenue_df$revenue), fill = revenue_df$revenue) +
geom_col() +
labs(title = "Total Revenue for Publishers", x = "Top 25% of Publishers", y = "Revenue (Euro)", color = 'Types of OA') +
theme(axis.text = element_text(size = rel(0.75))) +
scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
scale_fill_brewer(palette = "Set1")
ggplotly(publisher_revenue)
Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.Use of `revenue_df$revenue` is discouraged. Use `revenue` instead.
Idea: Revenue vs. OA
Question: What revenue are publishers receiving per year?
Observation:
# Finding total revenue for each publisher
revlist_2017 <- c()
revlist_2018 <- c()
revlist_2019 <- c()
revlist <- c()
for (name in top25_list) {
pub_name <- filter(filtered, filtered$publisher == name)
rev_2017 = sum(pub_name[pub_name$period == 2017,]$euro)
revlist_2017 <- c(revlist_2017, rev_2017)
rev_2018 = sum(pub_name[pub_name$period == 2018,]$euro)
revlist_2018 <- c(revlist_2018, rev_2018)
rev_2019 = sum(pub_name[pub_name$period == 2019,]$euro)
revlist_2019 <- c(revlist_2019, rev_2019)
}
revenue_df <- data.frame("publisher" = top25_list)
revenue_df$'2017' <- c(revlist_2017)
revenue_df$'2018' <- c(revlist_2018)
revenue_df$'2019' <- c(revlist_2019)
print(revenue_df)
revenue_year <- c(revenue_df$'2017', revenue_df$'2018', revenue_df$'2019')
year <- c('2017', '2018', '2019')
# ggplot:
pub_year_revenue1 <- revenue_df %>%
ggplot(data = revenue_df, mapping = aes(x = '2017', y = revenue_df$'2017', fill = revenue_df$publisher)) +
geom_bar(position="dodge", stat="identity") +
labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
theme(axis.text = element_text(size = rel(0.75)))
ggplotly(pub_year_revenue1)
Use of `revenue_df$"2017"` is discouraged. Use `2017` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.
pub_year_revenue2 <- revenue_df %>%
ggplot(data = revenue_df, mapping = aes(x = '2018', y = revenue_df$'2018', fill = revenue_df$publisher)) +
geom_bar(position="dodge", stat="identity") +
labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
theme(axis.text = element_text(size = rel(0.75)))
ggplotly(pub_year_revenue2)
Use of `revenue_df$"2018"` is discouraged. Use `2018` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.
pub_year_revenue3 <- revenue_df %>%
ggplot(data = revenue_df, mapping = aes(x = '2019', y = revenue_df$'2019', fill = revenue_df$publisher)) +
geom_bar(position="dodge", stat="identity") +
labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
theme(axis.text = element_text(size = rel(0.75)))
ggplotly(pub_year_revenue3)
Use of `revenue_df$"2019"` is discouraged. Use `2019` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.
Continued, tried putting it into one graph.
revlist <- c()
revlist_2017 <- c()
revlist_2018 <- c()
revlist_2019 <- c()
for (name in top25_list) {
pub_name <- filter(filtered, filtered$publisher == name)
rev_2017 = sum(pub_name[pub_name$period == 2017,]$euro)
revlist_2017 <- c(revlist_2017, rev_2017)
rev_2018 = sum(pub_name[pub_name$period == 2018,]$euro)
revlist_2018 <- c(revlist_2018, rev_2018)
rev_2019 = sum(pub_name[pub_name$period == 2019,]$euro)
revlist_2019 <- c(revlist_2019, rev_2019)
}
revlist <- c(revlist_2017, revlist_2018, revlist_2019)
print(revlist)
[1] 127420 153491 43044 96849 51824 48131 58125 61875 54750 36000 53625 21375 51750 58125 36000 17625 40500 42375
nrev <- matrix(revlist, ncol=6, byrow=TRUE)
colnames(nrev) <- top25_list
rownames(nrev) <- c("2017", "2018", "2019")
nrev <- as.table(nrev)
nrev <- as.data.frame.matrix(nrev)
print(nrev)
#, nrev$`Duke University Press`, nrev$`University of Michigan Press`, nrev$`Pluto Press`, nrev$`Manchester University Press`, nrev$`Liverpool University Press`
pub_year_rev <- nrev %>%
ggplot(data = nrev, mapping = aes(x = c("2017", "2018", "2019"), y = c(nrev$"transcript Verlag"), fill = nrev$publisher)) +
geom_bar(position="dodge", stat="identity") +
labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
theme(axis.text = element_text(size = rel(0.75)))
ggplotly(pub_year_rev)
Use of `nrev$"transcript Verlag"` is discouraged. Use `transcript Verlag` instead.Use of `nrev$publisher` is discouraged. Use `publisher` instead.
Idea: DOAB analysis
Question: What is the average time gap between year of publication and added on date?
Observation:
DOABmeta.df <- filter(DOABmeta.df, is.na(DOABmeta.df$Year.of.publication))
print(DOABmeta.df$Year.of.publication[1:4])
[1] <NA> <NA> <NA> <NA>
694 Levels: ...
gap = mean(DOABmeta.df$Added.on.date - DOABmeta.df$Year.of.publication[1:3])
‘-’ not meaningful for factors
print(gap)
[1] NA
Comparison of charges by year and backlist
Interactive charges exploration
### Interactive Dataset Exploration
---
title: "Exploratory Analysis"
output: html_notebook
---

```{r}
#Required libraries

# Tidyverse for data science and exploration
require(dplyr)
require(tidyr)
require(readr)
require(tibble)
require(stringr)
require(purrr)
require(forcats)
require(rlang)

# enhances tidyverse
require(tidylog) # additional logging
require(magrittr) # additional data pipe syntax


# for reading data in multiple formats
require(readxl)
require(haven)

# visual analysis
require(ggplot2)
require(GGally) # extensions to ggplot
require(gt) # well formatted tables
# client-side interactive publishable graphics
require(plotly)
require(leaflet)
require(crosstalk)
require(htmlwidgets)
# server-side interactive graphics
require(shiny)
require(shinyjs)
# Canned Interactive EDA 
require(ExPanDaR)


```
## Exploring KU Book Processing Charges
```{r  }
# Load the KU book-processing-charge records from the local OpenAPC checkout.
KUbpc.df <- read_csv(file = "Public Data/openapc-de/data/bpc.csv")

# Load the DOAB metadata: the ingest script is sourced first, then
# doabFetch() is called (presumably defined by that script -- confirm).
source(file = "Public Data/DOAB/doabingest.R")
DOABmeta.df <- doabFetch()
```
```{r  }


# First look at the book-processing-charge records: leading rows and the
# first lines of the per-column summary.
head(KUbpc.df)
head(summary(KUbpc.df))

# Number of records per institution.
# Columns are referenced by bare name inside aes(); `df$col` bypasses the
# plot's data argument and is discouraged by ggplot2.
ggplot(data = KUbpc.df, mapping = aes(x = institution)) +
  geom_bar()

# Distribution of the charge amount (column `euro`).
ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_histogram()

```
## General Exploratory Data Analysis
```{r  }

# Number of records by DOAB flag.
ggplot(data = KUbpc.df, mapping = aes(x = doab)) +
  geom_bar()

# Date to Doab: frequency of records per period, one line per DOAB flag.
# The data frame is passed once via `data =`; the earlier form also piped it
# in, which pushed a second copy into ggplot()'s `...`.
date_doab <- ggplot(data = KUbpc.df, mapping = aes(x = period, colour = doab)) +
  geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)

# publisher_euro <- ggplot(data = KUbpc.df, mapping = aes(x = publisher, colour = euro)) +
#   geom_freqpoly(binwidth = 0.1)

# Institution to Euro: frequency of charge amounts in 500-wide bins,
# one line per institution.
institution_euro <- ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_freqpoly(mapping = aes(colour = institution), binwidth = 500)

ggplotly(institution_euro)

```
## Idea: Publishers vs. Charges
## Question: How do the top 25% of publishers divide up charges (in Euro)?
## Observation: Charges are grouped around ~2000 Euros and ~8000 Euros. 
```{r  }

# Records per publisher, then the same table ordered largest first.
publisher_counts <- count(KUbpc.df, publisher)
sorted_counts <- arrange(publisher_counts, desc(n))

# total_n: all records; quarter_n: a quarter of them; new_n: records held by
# the six largest publishers (head() instead of the former `[0:6]` index,
# which only worked because R ignores a zero subscript).
total_n <- sum(sorted_counts$n)
quarter_n <- 0.25 * total_n
new_n <- sum(head(sorted_counts$n, 6))

# Publishers with more than 24 records.
sorted_counts %>% filter(n > 24)

# Keep only the six publishers analysed as the "top 25%" in this notebook.
filtered <- filter(
  KUbpc.df,
  publisher %in% c(
    "transcript Verlag",
    "Duke University Press",
    "University of Michigan Press",
    "Manchester University Press",
    "Pluto Press",
    "Liverpool University Press"
  )
)

head(filtered)

# Count plot of charge per publisher: point size and colour both show how
# many records share a (publisher, euro) combination.
# after_stat(n) replaces the deprecated `..n..` notation.
euro_publisher <- ggplot(data = filtered, mapping = aes(x = publisher, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How Publishers Divide Charges",
    x = "Top 25% of Publishers",
    y = "Price (Euro)",
    color = "Number of Copies"
  ) +
  # Wrap long publisher names so the axis labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

# ggplot:
ggplotly(euro_publisher)

# crosstalk:
# Columns are mapped by bare name so they are resolved from the shared data
# object rather than from a separate copy of `filtered`.
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = publisher, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How Publishers Divide Charges",
    x = "Top 25% of Publishers",
    y = "Price (Euro)",
    color = "Number of Copies"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

# Both elements get the full 12-unit width, so the filter and the plot are
# stacked; crosstalk warns that the widths sum to more than 12.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# Display the layout directly; wrapping it in a second bscols() is not needed.
cross_ft

# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)


```
## Idea: Publishers' Charges vs. Year/OA Type
## Sub-Question: What best explains the particular division of charges? (Year, OA Type)
## Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year. 
```{r  }

head(filtered)

# Does Type of OA impact the particular division of charges?

# Count plot of charge per backlist_oa value. Columns are mapped by bare
# name, and after_stat(n) replaces the deprecated `..n..` notation.
euro_oa_publisher <- ggplot(data = filtered, mapping = aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How OA Impacts Price Division of Charges",
    x = "Type of OA",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

# ggplot:
ggplotly(euro_oa_publisher)

# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How OA Impacts Division of Charges",
    x = "Type of OA",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

# Both elements get the full 12-unit width, so the filter and the plot are
# stacked; crosstalk warns that the widths sum to more than 12.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# Display the layout directly; wrapping it in a second bscols() is not needed.
cross_ft


# Does Year impact the particular division of charges?

# Same count plot with the period on the x axis.
euro_year_publisher <- ggplot(data = filtered, mapping = aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How Year Impacts Price Division of Charges",
    x = "Year",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

# ggplot:
ggplotly(euro_year_publisher)

# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How Year Impacts Division of Charges",
    x = "Year",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

cross_ft


```
## Idea: Publishers vs. OA
## Question: What type of business model do the top 25% publishers use?
## Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).
```{r  }

# Proportion of backlist_oa TRUE/FALSE records per publisher. Bars keep a
# fixed grey fill; the OA type is shown by the outline colour.
# The former `fill = filtered$backlist_oa` argument sat outside aes(), so
# ggplot() ignored it; it has been dropped.
oa_type <- ggplot(data = filtered, mapping = aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
  labs(
    title = "Business Model OA for Publishers",
    x = "Top 25% of Publishers",
    y = "Proportion",
    color = "Types of OA"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  # Wrap long publisher names so the axis labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17)) +
  scale_color_brewer(palette = "Set1")

ggplotly(oa_type)

# crosstalk:
# Columns are mapped by bare name: `ft` is a shared-data object, not a data
# frame, so `ft$publisher` does not return the column.
ft <- highlight_key(filtered)
oa_ft <- ggplot(data = ft, mapping = aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7) +
  labs(
    title = "Business Model OA for Publishers",
    x = "Top 25% of Publishers",
    y = "Proportion of Backlist OA",
    color = "Types of OA"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))
# cross_oa_ft <- bscols(
#   filter_select("publisher", "Select a publisher", ft, ~publisher),
#   ggplotly(oa_ft, dynamicTicks = TRUE),
#   # widths = c(12, 12)
# )

# bscols(cross_oa_ft)


```
## Idea: Publishers' OA vs. Year
## Question: Did OA business models of the top 25% publishers change per year?
## Observation:
```{r  }

#' Plot one publisher's backlist-OA proportions per year
#'
#' Filters `data` to the rows of a single publisher and draws, for each of
#' the years 2017-2019, the proportion of records per `backlist_oa` value.
#' Rows from other years fall outside the scale limits and are not drawn.
#'
#' @param pub_name Publisher name, compared against the `publisher` column.
#' @param data Data frame with `publisher`, `period` and `backlist_oa`
#'   columns. Defaults to the notebook's `filtered` data frame, so existing
#'   one-argument calls keep working.
#' @return The interactive plotly widget built from the ggplot.
oa_time <- function(pub_name, data = filtered) {
  pub_ft <- filter(data, publisher == pub_name)

  # `period` is numeric, so it is converted to a factor and matched with
  # character limits. Numeric limits on a discrete scale trigger ggplot2's
  # "Continuous limits supplied to discrete scale" warning.
  pub_oa <- ggplot(
    data = pub_ft,
    mapping = aes(x = factor(period), colour = backlist_oa)
  ) +
    geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
    labs(
      title = paste0(pub_name, "'s OA Through the Years"),
      x = "Years",
      y = "Proportion of Backlist OA",
      color = "Types of OA"
    ) +
    theme(axis.text = element_text(size = rel(0.75))) +
    scale_x_discrete(limits = c("2017", "2018", "2019")) +
    scale_color_brewer(palette = "Set1")

  ggplotly(pub_oa)
}

# Names of the six publishers analysed as the "top 25%".
top25_list <- c(
  "transcript Verlag",
  "Duke University Press",
  "University of Michigan Press",
  "Manchester University Press",
  "Pluto Press",
  "Liverpool University Press"
)

# Yearly OA chart for each of those publishers, one call per publisher.
oa_time("transcript Verlag")

oa_time("Duke University Press")

oa_time("University of Michigan Press")

oa_time("Manchester University Press")

oa_time("Pluto Press")

oa_time("Liverpool University Press")

```
## Idea: Revenue vs. OA
## Question: What total revenue are publishers receiving each year?
## Observation: 
```{r  }

# Finding total revenue for each publisher

#' Total charges recorded for one publisher
#'
#' @param pub_name Publisher name, compared against the `publisher` column.
#' @param data Data frame with `publisher` and `euro` columns. Defaults to
#'   the notebook's `filtered` data frame, so existing one-argument calls
#'   keep working.
#' @return The sum of `euro` over the publisher's rows (0 when no row
#'   matches). Returned visibly; the previous version only yielded it as the
#'   invisible value of an assignment.
revenue_finder <- function(pub_name, data = filtered) {
  # which() drops rows whose publisher is NA instead of indexing with NA.
  sum(data$euro[which(data$publisher == pub_name)])
}

# One row per top publisher with its total revenue. vapply() builds the
# whole vector in one step instead of growing it inside a loop.
revenue_df <- data.frame("publisher" = top25_list)
revenue_list <- vapply(top25_list, revenue_finder, numeric(1), USE.NAMES = FALSE)

revenue_df$revenue <- revenue_list
print(revenue_df)

# ggplot:
# Columns are mapped by bare name. The former `fill =` argument sat outside
# aes() and was ignored, so it is dropped together with the fill scale and
# the colour label, which had no aesthetic to act on.
publisher_revenue <- ggplot(data = revenue_df, mapping = aes(x = publisher, y = revenue)) +
  geom_col() +
  labs(
    title = "Total Revenue for Publishers",
    x = "Top 25% of Publishers",
    y = "Revenue (Euro)"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  # Wrap long publisher names so the axis labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

ggplotly(publisher_revenue)


```
## Idea: Revenue vs. OA
## Question: What revenue are publishers receiving per year?
## Observation: 
```{r  }

# Finding total revenue for each publisher, per year

# Sum of `euro` for every publisher in top25_list (in that order) within a
# single year. Publisher/year combinations without rows sum to 0.
revenue_in_year <- function(year) {
  vapply(
    top25_list,
    function(pub) {
      sum(filtered$euro[which(filtered$publisher == pub & filtered$period == year)])
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

revlist_2017 <- revenue_in_year(2017)
revlist_2018 <- revenue_in_year(2018)
revlist_2019 <- revenue_in_year(2019)

# Wide table: one row per publisher, one revenue column per year.
revenue_df <- data.frame("publisher" = top25_list)
revenue_df[["2017"]] <- revlist_2017
revenue_df[["2018"]] <- revlist_2018
revenue_df[["2019"]] <- revlist_2019

print(revenue_df)

# Kept from the original notebook; not used by the plots below.
revenue_year <- c(revenue_df[["2017"]], revenue_df[["2018"]], revenue_df[["2019"]])
year <- c("2017", "2018", "2019")

# ggplot:
# One dodged bar chart per year. The year columns have non-syntactic names,
# so they are referenced with backticks inside aes(). The legend title is
# set through `fill`, the aesthetic that is actually mapped.
pub_year_revenue1 <- ggplot(
  data = revenue_df,
  mapping = aes(x = "2017", y = `2017`, fill = publisher)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Revenue for Publishers",
    x = "Year",
    y = "Revenue (Euro)",
    fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue1)

pub_year_revenue2 <- ggplot(
  data = revenue_df,
  mapping = aes(x = "2018", y = `2018`, fill = publisher)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Revenue for Publishers",
    x = "Year",
    y = "Revenue (Euro)",
    fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue2)

pub_year_revenue3 <- ggplot(
  data = revenue_df,
  mapping = aes(x = "2019", y = `2019`, fill = publisher)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Revenue for Publishers",
    x = "Year",
    y = "Revenue (Euro)",
    fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue3)


```
### Continued, tried putting it into one graph. 
```{r}

# Sum of `euro` for every publisher in top25_list (in that order) within a
# single year. Publisher/year combinations without rows sum to 0.
revenue_in_year <- function(year) {
  vapply(
    top25_list,
    function(pub) {
      sum(filtered$euro[which(filtered$publisher == pub & filtered$period == year)])
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

revenue_years <- c("2017", "2018", "2019")

revlist_2017 <- revenue_in_year(2017)
revlist_2018 <- revenue_in_year(2018)
revlist_2019 <- revenue_in_year(2019)

# All revenues in one vector: every publisher for 2017, then 2018, then 2019.
revlist <- c(revlist_2017, revlist_2018, revlist_2019)

print(revlist)

# Wide table for display: one row per year, one column per publisher.
nrev <- matrix(
  revlist,
  ncol = length(top25_list),
  byrow = TRUE,
  dimnames = list(revenue_years, top25_list)
)
nrev <- as.data.frame(nrev)

print(nrev)

# Long table for plotting: one row per (year, publisher) pair, in the same
# order as `revlist`. The previous plot used a single publisher column and a
# `publisher` column that does not exist in the wide table, so it could not
# show all publishers in one graph.
nrev_long <- data.frame(
  year = rep(revenue_years, each = length(top25_list)),
  publisher = rep(top25_list, times = length(revenue_years)),
  revenue = revlist
)

# Grouped bars: years on the x axis, one bar per publisher within each year.
pub_year_rev <- ggplot(
  data = nrev_long,
  mapping = aes(x = year, y = revenue, fill = publisher)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Revenue for Publishers",
    x = "Year",
    y = "Revenue (Euro)",
    fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_rev)

```
## Idea: DOAB analysis
## Question: What is the average time gap between year of publication and added on date? 
## Observation: 
```{r}

# Average gap, in years, between a book's publication year and the year it
# was added to DOAB.
#
# Fixes relative to the previous version:
#   * rows are kept when the year is present (the old filter kept only the
#     rows where it was NA);
#   * both columns are converted to integer years before subtracting
#     (Year.of.publication is a factor, for which `-` is not meaningful);
#   * the full columns are compared instead of recycling the first three
#     publication years;
#   * the result goes into a new data frame, so DOABmeta.df is not
#     overwritten.
#
# NOTE(review): this assumes the first run of four digits in each value is
# the year (e.g. "2015" or "2015-03-01") -- confirm against the DOAB export.
doab_dated <- DOABmeta.df %>%
  mutate(
    publication_year = as.integer(
      str_extract(as.character(Year.of.publication), "\\d{4}")
    ),
    added_year = as.integer(
      str_extract(as.character(Added.on.date), "\\d{4}")
    )
  ) %>%
  filter(!is.na(publication_year), !is.na(added_year))

print(head(doab_dated$publication_year, 4))
gap <- mean(doab_dated$added_year - doab_dated$publication_year)
print(gap)

```
### Comparison of charges by year and backlist
```{r}
# Histogram of charges in 6 bins, faceted into a grid with one row per
# period and one column per backlist_oa value.
charges.plot <- ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_histogram(bins = 6) +
  facet_grid(rows = vars(period), cols = vars(backlist_oa))

# Draw the plot object as a standard (static) plot.
plot(charges.plot)

# this plot will render publicly https://htmlpreview.github.io/?https://github.com/MIT-Informatics/monograph/blob/master/00%20EDA%20Start.nb.html

```
### Interactive charges exploration
```{r}
 # Convert the faceted charges histogram into an interactive plotly widget.
 ggplotly(charges.plot)
# Presumably a published copy of this widget (confirm):
# https://mit-informatics.github.io/monograph/demo.html

```
### Interactive Dataset Exploration
```{r}
# Launch the ExPanD exploration app on the KU data; `.` is the piped-in
# KUbpc.df, and export_nb_option = TRUE is passed through to ExPanD.
KUbpc.df %>% ExPanD(df=.       ,title="KU Book Processing Charges",export_nb_option = TRUE)
# ExPanD uses shiny, which works when running R locally but will not work
# through GitHub. It could be published through shinyapps.io (low usage
# only), or exported as a non-interactive notebook.
# see: https://drmaltman.shinyapps.io/demo/
```

